/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_RefinementDetails.csv (downloaded 5/22/2018)
			pdb_RefinementParameters.csv (downloaded 5/22/2018)

Outputs:	clean_refine.dta

*******************************************************************************/

clear all

* ------------------------------------------------------------------------------
* Step 1: load the refinement details and stash them in a tempfile
* ------------------------------------------------------------------------------
import delimited "${data_raw}PDB/pdb_RefinementDetails.csv", clear

* Restore camelCase names (import delimited lower-cases headers by default).
* NOTE(review): "rObservered" looks like a typo for "rObserved"; it is kept
* as-is here because other scripts may reference this exact name -- confirm
* before changing.
rename (structureid robserved   rall rwork rfree averagebfactor 			///
		refinementresolution)												///
	   (structureId rObservered rAll rWork rFree averageBFactor 			///
		refinementResolution)

tempfile refineDetails
save `refineDetails'

* ------------------------------------------------------------------------------
* Step 2: load the refinement parameters and reduce to one row per structure
* ------------------------------------------------------------------------------
import delimited "${data_raw}PDB/pdb_RefinementParameters.csv", clear

* Within each structure keep the first row when ordered by high-resolution
* limit and then by (raw, not yet lower-cased) determination method.
* NOTE(review): empty strings sort first, so among rows tied on resolution a
* blank method wins over a populated one -- verify this is intended.
bysort structureid (highresolutionlimit structuredeterminationmethod): 		///
	keep if _n == 1

* Only the identifier and the determination method are needed downstream
keep structureid structuredeterminationmethod
rename (structureid structuredeterminationmethod) (structureId determineMethod)

* Normalise case so the string comparisons below are case-insensitive
replace determineMethod = lower(determineMethod)
format determineMethod %50s

* ------------------------------------------------------------------------------
* Step 3: indicator for molecular replacement
* ------------------------------------------------------------------------------
* 1 if the method is one of the listed spellings, 0 for any other non-empty
* method, and missing when no method was recorded.
generate molecularReplacement = 											///
	inlist(determineMethod, "molecular replacement", "mr", "molrep", 		///
		   "mr, mr") 														///
	if determineMethod != ""

* ------------------------------------------------------------------------------
* Step 4: attach the refinement details and write the cleaned file
* ------------------------------------------------------------------------------
* 1:1 merge keeps unmatched rows from both sides; _merge is not retained.
merge 1:1 structureId using `refineDetails', nogenerate

* NOTE(review): ${data_raw} is used above without a separating slash while
* ${data_clean} gets one here -- check the two globals are defined consistently.
save "${data_clean}/clean_refine.dta", replace


